import pandas as pdimport plotly.express as pximport plotly.io as piofrom pyspark.sql import SparkSessionimport reimport numpy as npimport plotly.graph_objects as gofrom pyspark.sql.functions import col, split, explode, regexp_replace, transform, whenfrom pyspark.sql import functions as Ffrom pyspark.sql.functions import col, monotonically_increasing_idnp.random.seed(51)pio.renderers.default ="notebook"# Initialize Spark Sessionspark = SparkSession.builder.appName("LightcastData").getOrCreate()# Load Datadf = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")df.createOrReplaceTempView("job_postings")# Show Schema and Sample Data#print("---This is Diagnostic check, No need to print it in the final doc---")#df.printSchema() # comment this line when rendering the submission#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/24 02:30:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 1:> (0 + 1) / 1] 25/09/24 02:30:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
fig = px.box( pdf, x="NAICS2_NAME", y="SALARY", title="Salary Distribution by Industry", color_discrete_sequence=["purple"], points="outliers",)fig.update_layout( font_family="Times New Roman", title_font_size=16, xaxis_title="Industry", yaxis_title="Salary", xaxis_tickangle=45,)fig.show()fig.write_html("Q1a.html")#fig.write_image("Q1a.png")
#Analysis: This box plot depicts how salaries vary across industries. Industries like Arts, Entertainment, and Recreation have a wider box, highlighting more variabilty in average salary, but with fewer outliers, most salaries stay within a predictable range. Unlike in Health Care and Social services, the smaller box shows consistency in typical salaries, but the presence of several outlies (above and below the median), indicate some employees make more or less than typical. It remarks on how different industries and their subsectors can affect the chances of making around average salary or not.
#Question 1b - Salary Distribution by Employment Type
fig = px.box( pdf, x="EMPLOYMENT_TYPE_NAME", y="SALARY", title="Salary Distribution by Employment Type", color_discrete_sequence=["orange"], points="outliers",)fig.update_layout( font_family="Times New Roman", title_font_size=16, xaxis_title="Employment Type", yaxis_title="Salary", xaxis_tickangle=45,)fig.show()fig.write_html("Q1b.html")#fig.write_image("Q1b.png")
#Analysis: Despite all three having outliers, Full-time employees have more consistency in salary expectations than part-time employees as their hours are set while parttime might fluctate based on the needs of the business. Because of the increased variability, there is more job security and stability in full-time roles over part-time roles.
fig = px.box( pdf, x="NAICS2_NAME", y="SALARY_FROM", color="EMPLOYMENT_TYPE_NAME", points="outliers", title="Salary Distribution by Industry and Employment Type", color_discrete_sequence=["purple", "orange", "green"])fig.update_layout( font_family="Times New Roman", title_font_size=16, xaxis_title="Industry", yaxis_title="Salary", xaxis_tickangle=45, boxmode="group"# <-- ensures grouping by employment type)fig.show()
#Question 2 - Salary Analysis by ONET Occupation Type
saonet = spark.sql(""" SELECT LOT_OCCUPATION_NAME AS Occupation_Name, PERCENTILE(SALARY, 0.5) AS Median_Salary, COUNT(*) AS Job_Postings FROM job_postings GROUP BY LOT_OCCUPATION_NAME ORDER BY Job_Postings DESC LIMIT 10""")saonet_pd = saonet.toPandas()saonet_pd.head()fig = px.scatter( saonet_pd, x="Occupation_Name", y="Median_Salary", size="Job_Postings", title="Salary Analysis by LOT Occupation Type (Bubble Chart)", labels={"LOT_OCCUPATION_NAME": "LOT Occupation","Median_Salary": "Median Salary","Job_Postings": "Number of Job Postings" }, hover_name="Occupation_Name", size_max=60, width=1600, height=600, color="Job_Postings", color_continuous_scale="Plasma")fig.update_layout( font_family="Times New Roman", font_size=16, title_font_size=20, xaxis_title="LOT Occupation", yaxis_title="Median Salary", xaxis=dict( tickangle=-45, showline=True, linecolor="black" ), yaxis=dict( showline=True, linecolor="black" ))fig.show()fig.write_html("Q2.html")
[Stage 6:> (0 + 1) / 1]
#Analysis: The chart depicts how variation in median salary and earning potential can be narrow in some occupations and widespread in others. For example Computer Systems Engineer/ Architect have the highest median salary but has a small range for salary variation, while Data/ Data Mining Analyst have a lower median salary but wider range for salary variation.
#Analysis: Although there is a positive relationship between the years of experience (YOE) and salary for both education groups, it is evident that those with Higher degrees (Master’s and PhDs) at all YOEs make more than the Lower degrees (GED, Associate, Bachelor’s). This highlights how advanced education correlates to higher pay.
#Question 4 - Salary by Remote Work type
df = df.withColumn("REMOTE_GROUP", when(col("REMOTE_TYPE_NAME").rlike("(?i)^Remote$"), "Remote") .when(col("REMOTE_TYPE_NAME").rlike("(?i)^Hybrid Remote$"), "Hybrid") .when(col("REMOTE_TYPE_NAME").isNull() | col("REMOTE_TYPE_NAME").rlike("(?i)^Not Remote$"), "Onsite") .otherwise("Other"))# --- Step 2: Keep numeric columns as float & filter valid rows ---df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))df = df.filter( col("MAX_YEARS_EXPERIENCE").isNotNull() & (col("MAX_YEARS_EXPERIENCE")>0) & col("Average_Salary").isNotNull() & (col("Average_Salary")>0))# --- Step 3: Filter only the main three remote types ---df_filtered = df.filter(col("REMOTE_GROUP").isin(["Remote", "Hybrid", "Onsite"]))# --- Step 4: Convert to Pandas ---df_pd = df_filtered.toPandas()fig = px.scatter( df_pd, x="MAX_YEARS_EXPERIENCE", y="Average_Salary", color="REMOTE_GROUP", hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"], title="Experience vs Salary by Remote Work Type", opacity=0.7, color_discrete_sequence=["yellow", "magenta", "blue"])fig.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))fig.update_layout( font_family="Times New Roman", title_font=dict(size=20), font=dict(size=16), xaxis_title="Years of Experience", yaxis_title="Average Salary ($)", legend_title="Remote Work Type", xaxis=dict(gridcolor="lightblue", tickmode="linear", dtick=1), yaxis=dict(gridcolor="lightblue"))fig.show()fig.write_html("Q4.html")
[Stage 10:> (0 + 1) / 1]
#Analysis: Both Remote and Hybrid roles have good average salaries that are similar and sometimes exceed those of onsite positions, particularly in more mid level and senior roles. This shows that there is no decuction in salary from working remote, and that with more years of expereince, the roles lean towards remote expectations.
fig = px.histogram( df_pd, x="Average_Salary", color="REMOTE_GROUP", barmode="overlay", #bars are layed unto of each other nbins=30, #each bin for bar is $10K opacity=0.7, color_discrete_sequence=["yellow", "magenta", "blue"], title="Salary Distribution by Remote Work Type - bar graph")# Update layoutfig.update_layout( font_family="Times New Roman", title_font=dict(size=20), font=dict(size=16), xaxis_title="Average Salary ($)", yaxis_title="Count", legend_title="Remote Work Type", xaxis=dict(gridcolor="lightblue"), yaxis=dict(gridcolor="lightblue"))fig.show()fig.write_html("Q4Histogram.html")